In [4]:
# Polynomial Regression (application to the US population)
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
In [6]:
# Load the data
data = pd.read_csv("C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data\\USpop.csv")
print(data.columns)
Index(['Year', 'Population'], dtype='object')
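A quick sanity check one might run here before plotting; this sketch assumes the file holds one row per decennial census, with Population recorded in millions:
# Inspect the first rows and basic summaries; Population is assumed to be in millions
print(data.head())
print(data.describe())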
In [8]:
# Plot the data
plt.scatter(data['Year'], data['Population'])
plt.xlabel('Year')
plt.ylabel('Population')
plt.show()
In [10]:
# LINEAR MODEL
# Define the linear model
X_lin = sm.add_constant(data['Year']) # Adds a constant term to the predictor
lin_model = sm.OLS(data['Population'], X_lin).fit()
# Summary of the linear model
print(lin_model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Population   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.918
Method:                 Least Squares   F-statistic:                     257.9
Date:                Thu, 01 Aug 2024   Prob (F-statistic):           1.24e-13
Time:                        16:29:56   Log-Likelihood:                -114.70
No. Observations:                  24   AIC:                             233.4
Df Residuals:                      22   BIC:                             235.8
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2600.2970    169.096    -15.378      0.000   -2950.980   -2249.614
Year           1.4245      0.089     16.059      0.000       1.241       1.609
==============================================================================
Omnibus:                        3.921   Durbin-Watson:                   0.096
Prob(Omnibus):                  0.141   Jarque-Bera (JB):                2.503
Skew:                           0.597   Prob(JB):                        0.286
Kurtosis:                       1.962   Cond. No.                     5.25e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.25e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [12]:
# Plot the linear fit
plt.scatter(data['Year'], data['Population'])
plt.plot(data['Year'], lin_model.predict(X_lin), color='red', linewidth=3)
plt.xlabel('Year')
plt.ylabel('Population')
plt.show()
In [14]:
# Clearly, the linear model is too inflexible and restrictive; it does not provide a good fit.
# This is underfitting. Notice, however, that R-squared in this regression is 0.921. Without
# looking at the plot, we could have assumed that the model is very good!
# Prediction for the year 2030 using the linear model
pred_lin_2030 = lin_model.predict([1, 2030])[0]
print(f"Linear model prediction for 2030: {pred_lin_2030}")
Linear model prediction for 2030: 291.5173913043477
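The same value can be reproduced by hand from the estimated coefficients; a minimal sketch:
# Reproduce the prediction manually: y_hat = b0 + b1 * 2030
b0, b1 = lin_model.params   # intercept (const) and slope (Year)
print(b0 + b1 * 2030)       # matches lin_model.predict([1, 2030])[0]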
In [16]:
# This is obviously a poor prediction. The US population was already 331 million at the most
# recent Census. So, we probably omitted an important predictor. Residual plots will help us
# determine which one.
# Let’s produce the related diagnostic plots. plot_regress_exog arranges four of them in a
# 2x2 grid automatically.
# Residual plots
fig = sm.graphics.plot_regress_exog(lin_model, 'Year')
plt.show()
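If only the key diagnostic is needed, the residuals-versus-fitted plot can also be drawn directly from the fitted model; a minimal sketch:
# Residuals vs. fitted values: a clear U-shape signals a missing quadratic term
plt.scatter(lin_model.fittedvalues, lin_model.resid)
plt.axhline(0, color='gray', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()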
In [18]:
# The first plot shows curvature in the residuals: an important quadratic term in the
# population growth has been omitted. So, fit a quadratic model.
# QUADRATIC MODEL
# Define the quadratic model
data['Year_sq'] = data['Year'] ** 2
X_quadr = sm.add_constant(data[['Year', 'Year_sq']])
quadr_model = sm.OLS(data['Population'], X_quadr).fit()
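As a cross-check (not required for the analysis), numpy can fit the same polynomial; the coefficients should match the statsmodels estimates:
# np.polyfit returns coefficients from the highest degree down: [c2, c1, c0]
coef = np.polyfit(data['Year'], data['Population'], deg=2)
print(coef)   # compare with Year_sq, Year, const in the summary below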
In [20]:
# Summary of the quadratic model
print(quadr_model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             Population   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 1.389e+04
Date:                Thu, 01 Aug 2024   Prob (F-statistic):           1.67e-33
Time:                        16:31:20   Log-Likelihood:                -58.969
No. Observations:                  24   AIC:                             123.9
Df Residuals:                      21   BIC:                             127.5
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        2.17e+04    522.731     41.515      0.000    2.06e+04    2.28e+04
Year         -24.1223      0.549    -43.914      0.000     -25.265     -22.980
Year_sq        0.0067      0.000     46.514      0.000       0.006       0.007
==============================================================================
Omnibus:                       10.491   Durbin-Watson:                   1.284
Prob(Omnibus):                  0.005   Jarque-Bera (JB):                8.556
Skew:                          -1.219   Prob(JB):                       0.0139
Kurtosis:                       4.616   Cond. No.                     3.09e+09
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.09e+09. This might indicate that there are
strong multicollinearity or other numerical problems.
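The large condition number arises because Year and Year_sq are almost perfectly correlated over the observed range of years. One standard remedy, sketched here, is to center Year before squaring; this changes the coefficients but not the fitted curve or R-squared:
# Center the predictor to reduce collinearity between Year and Year^2
data['Year_c'] = data['Year'] - data['Year'].mean()
data['Year_c_sq'] = data['Year_c'] ** 2
X_cent = sm.add_constant(data[['Year_c', 'Year_c_sq']])
cent_model = sm.OLS(data['Population'], X_cent).fit()
print(cent_model.summary())   # same fit, far smaller condition number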
In [22]:
# A higher R-squared is not surprising: it always increases when we add
# new variables to the model. The fair criterion for comparing models with
# different numbers of parameters is Adjusted R-squared. The quadratic model
# has Adjusted R-squared = 0.999, compared with 0.918 for the linear model.
# Now let’s obtain the fitted values and plot the fitted curve.
# Plot the quadratic fit
plt.scatter(data['Year'], data['Population'])
plt.plot(data['Year'], quadr_model.predict(X_quadr), color='blue', linewidth=3)
plt.xlabel('Year')
plt.ylabel('Population')
plt.show()
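Adjusted R-squared applies the penalty 1 - (1 - R^2)(n - 1)/(n - p - 1), where n is the number of observations and p the number of predictors. A short sketch verifying the values quoted above:
# Verify adjusted R-squared by hand for both models
for model, p in [(lin_model, 1), (quadr_model, 2)]:
    n = int(model.nobs)
    adj = 1 - (1 - model.rsquared) * (n - 1) / (n - p - 1)
    print(round(adj, 4), round(model.rsquared_adj, 4))   # should agree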
In [24]:
# The quadratic curve fits nearly perfectly. The quadratic growth slowed down only during
# World War II, and the Baby Boom quickly brought the population back to the trend.
# Prediction for the year 2030 using the quadratic model
pred_quadr_2030 = quadr_model.predict([1, 2030, 2030 ** 2])[0]
print(f"Quadratic model prediction for 2030: {pred_quadr_2030}")
# Confidence interval for the mean response in 2030
pred_quadr_2030_conf = quadr_model.get_prediction([1, 2030, 2030 ** 2]).conf_int()
print(f"Confidence interval for the prediction for 2030: {pred_quadr_2030_conf}")
# Prediction interval for a new observation in 2030
pred_quadr_2030_pred = quadr_model.get_prediction([1, 2030, 2030 ** 2]).summary_frame(alpha=0.05)
print(f"Prediction interval for the prediction for 2030: {pred_quadr_2030_pred}")
Quadratic model prediction for 2030: 364.1572134257949
Confidence interval for the prediction for 2030: [[359.96856497 368.34586188]]
Prediction interval for the prediction for 2030:
         mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  obs_ci_upper
0  364.157213  2.014147     359.968565     368.345862    356.610212    371.704215
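The prediction interval is wider than the confidence interval because it adds the variance of a new observation to the variance of the estimated mean. A sketch of that relationship, using the standard errors that get_prediction exposes:
# se_obs^2 should equal se_mean^2 plus the residual variance
pred = quadr_model.get_prediction([1, 2030, 2030 ** 2])
se_mean = pred.se_mean[0]    # uncertainty of the estimated mean response
se_obs = pred.se_obs[0]      # adds the noise of a single new observation
print(se_obs ** 2, se_mean ** 2 + quadr_model.mse_resid)   # should agree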
In [ ]:
# Food for thought... Are the confidence and prediction intervals valid here?
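One way to probe this question: OLS intervals assume independent errors, yet census counts form a time series, and the Durbin-Watson statistics reported above (0.096 linear, 1.284 quadratic) are below 2, hinting at positive autocorrelation. Extrapolating a decade beyond the observed years adds further risk. A sketch of a direct check:
# Positively autocorrelated residuals make OLS intervals too narrow
from statsmodels.stats.stattools import durbin_watson
print(durbin_watson(quadr_model.resid))   # values well below 2 are suspicious
plt.plot(data['Year'], quadr_model.resid, marker='o')
plt.axhline(0, color='gray', linestyle='--')
plt.xlabel('Year')
plt.ylabel('Residual')
plt.show()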